{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import KFold\n",
    "from sklearn.model_selection import train_test_split\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### joblib保存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 57 s, sys: 54.5 s, total: 1min 51s\n",
      "Wall time: 1min 58s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# 导入训练数据集\n",
    "data_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id','loan_hour'])\n",
    "data_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',nrows=33465).drop(columns=['id'])\n",
    "major_df = joblib.load('../../preprocess_data_discrete/major_df.lz4').head(33465)\n",
    "# 以下3个数据代替原先的data_raw\n",
    "cat_df = joblib.load('../../preprocess_data_discrete/cat_df.lz4').head(33465)\n",
    "discrete_df = joblib.load('../../preprocess_data_discrete/discrete_df.lz4').head(33465)\n",
    "scale_df = joblib.load('../../preprocess_data_discrete/scale_df.lz4').head(33465)\n",
    "data_tag = pd.read_csv('../../preprocess_data/train_x_33465.csv',usecols=['tag'])\n",
    "maj_cnt_df = joblib.load('../../preprocess_data_discrete/maj_cnt_df.lz4').head(33465)\n",
    "\n",
    "data = pd.concat([data_date,data_null,major_df,cat_df,discrete_df,scale_df,data_tag,maj_cnt_df],axis=1)\n",
    "data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])\n",
    "x = data.fillna(-1).values\n",
    "y = data_label.values.ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['./train_data.lz4']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "joblib.dump(data,'./train_data.lz4',compress='lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 8.92 s, sys: 860 ms, total: 9.78 s\n",
      "Wall time: 9.86 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# 导入测试数据\n",
    "valid_date = pd.read_csv('../../preprocess_data/valid_date.csv').drop(columns=['id','loan_hour'])\n",
    "valid_null = pd.read_csv('../../preprocess_data_new/valid_row_null.csv').drop(columns=['id'])\n",
    "major_test = joblib.load('../../preprocess_data_discrete/major_test.lz4')\n",
    "cat_test = joblib.load('../../preprocess_data_discrete/cat_test.lz4')\n",
    "discrete_test = joblib.load('../../preprocess_data_discrete/discrete_test.lz4')\n",
    "scale_test = joblib.load('../../preprocess_data_discrete/scale_test.lz4')\n",
    "valid_tag = pd.read_csv('../predict_tag/tag.csv',usecols=['tag'])\n",
    "maj_cnt_test = joblib.load('../../preprocess_data_discrete/maj_cnt_test.lz4')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "valid = pd.concat([valid_date,valid_null,major_test,cat_test,discrete_test,scale_test,valid_tag,maj_cnt_test],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.66 s, sys: 532 ms, total: 5.19 s\n",
      "Wall time: 9.37 s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['./valid_data.lz4']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "joblib.dump(valid,'./valid_data.lz4',compress='lz4')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 加载数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 9.3 s, sys: 11.1 s, total: 20.4 s\n",
      "Wall time: 20.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# train\n",
    "data = joblib.load('./train_data.lz4')\n",
    "data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])\n",
    "x = data.fillna(-1).values\n",
    "y = data_label.values.ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 844 ms, sys: 776 ms, total: 1.62 s\n",
      "Wall time: 1.62 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# test\n",
    "valid = joblib.load('./valid_data.lz4')\n",
    "x_test = valid.fillna(-1).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 线上测试\n",
    "x_train = x\n",
    "y_train = y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "def SelectModel(model_name):\n",
    "    if model_name == 'GBC':\n",
    "        from sklearn.ensemble import GradientBoostingClassifier\n",
    "        model = GradientBoostingClassifier(loss='deviance',\n",
    "                                           learning_rate =0.1,\n",
    "                                           n_estimators=300,\n",
    "                                           subsample=0.9,\n",
    "                                           max_depth=3,\n",
    "#                                            verbose=1,\n",
    "                                          random_state=2018)\n",
    "    elif model_name == 'XGB':\n",
    "        from xgboost import XGBClassifier\n",
    "\n",
    "        model = XGBClassifier(max_depth=6,\n",
    "                              learning_rate =0.04, \n",
    "                              booster='gbtree',\n",
    "                              objective='binary:logistic',\n",
    "                              early_stopping_rounds=100,\n",
    "                              scale_pos_weight=float(len(y_train)-np.sum(y_train))/float(np.sum(y_train)),\n",
    "                              eval_metric='auc',\n",
    "                              gamma=1,\n",
    "                              reg_lambda=1,\n",
    "                              subsample=0.9,\n",
    "                              min_child_weight=1,\n",
    "                              seed=2018,\n",
    "                              silent=False,\n",
    "                              n_jobs=24,\n",
    "                              num_boost_round =400\n",
    "                             )\n",
    "    elif model_name == 'RFC':\n",
    "        from sklearn.ensemble import RandomForestClassifier\n",
    "        model = RandomForestClassifier(n_estimators=1500,\n",
    "                                       n_jobs =36,\n",
    "                                       max_features='sqrt',\n",
    "                                       class_weight='balanced',\n",
    "                                       verbose =1,\n",
    "                                       random_state=2018)\n",
    "    elif model_name == 'LGB':\n",
    "        from lightgbm import LGBMClassifier\n",
    "        model = LGBMClassifier(boost='gbdt',\n",
    "                    num_leaves=135, \n",
    "                    scale_pos_weight=float(len(y_train)-np.sum(y_train.ravel()))/float(np.sum(y_train.ravel())),\n",
    "                    max_depth=-1,\n",
    "                    learning_rate=.04,\n",
    "                    max_bin=200,\n",
    "                    min_data_in_leaf= 60,\n",
    "                    objective='binary',\n",
    "                    metric='auc',\n",
    "                    num_threads=32,\n",
    "                    slient=False,\n",
    "                    num_boost_round =400)\n",
    "    else:\n",
    "        pass\n",
    "    return model\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=36)]: Using backend ThreadingBackend with 36 concurrent workers.\n",
      "[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    5.4s\n",
      "[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:   14.2s\n",
      "[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:   26.2s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "rfc_model = SelectModel('RFC')\n",
    "rfc_model.fit(x_train[:,:10800],y_train)\n",
    "joblib.dump(rfc_model,'rfc_model')\n",
    "\n",
    "lgb_model = SelectModel('LGB')\n",
    "lgb_model.fit(x_train,y_train)\n",
    "joblib.dump(lgb_model,'lgb_model')\n",
    "\n",
    "xgb_model = SelectModel('XGB')\n",
    "xgb_model.fit(x_train,y_train)\n",
    "joblib.dump(xgb_model,'xgb_model')\n",
    "\n",
    "gbc_model = SelectModel('GBC')\n",
    "gbc_model.fit(x_train,y_train)\n",
    "joblib.dump(gbc_model,'gbc_model')\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### voting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_id = pd.read_csv('../../preprocess_data_new/valid_date.csv',usecols=['id']).values.ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "rfc_model = joblib.load('rfc_model')\n",
    "lgb_model = joblib.load('lgb_model')\n",
    "xgb_model = joblib.load('xgb_model')\n",
    "gbc_model = joblib.load('gbc_model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=36)]: Using backend ThreadingBackend with 36 concurrent workers.\n",
      "[Parallel(n_jobs=36)]: Done 128 tasks      | elapsed:    0.2s\n",
      "[Parallel(n_jobs=36)]: Done 378 tasks      | elapsed:    0.3s\n",
      "[Parallel(n_jobs=36)]: Done 728 tasks      | elapsed:    0.6s\n",
      "[Parallel(n_jobs=36)]: Done 1178 tasks      | elapsed:    0.9s\n",
      "[Parallel(n_jobs=36)]: Done 1500 out of 1500 | elapsed:    1.1s finished\n"
     ]
    }
   ],
   "source": [
    "rfc_pred = rfc_model.predict_proba(x_test[:,:10800])[:,1]\n",
    "lgb_pred = lgb_model.predict_proba(x_test)[:,1]\n",
    "xgb_pred = xgb_model.predict_proba(x_test)[:,1]\n",
    "gbc_pred = gbc_model.predict_proba(x_test)[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred = pd.DataFrame()\n",
    "pred['id'] = test_id\n",
    "pred['prob'] = (rfc_pred + lgb_pred + xgb_pred + gbc_pred)/4\n",
    "import os\n",
    "if not os.path.exists('./pred'):\n",
    "    os.mkdir('pred')\n",
    "pred.to_csv('./pred/voting_pred.txt',index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "voting_pred AUC:0.8121  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
